source(here::here("src/funs.R"))
source(here::here("validation/cs_process.R"))
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## here() starts at /Users/Lauren/Documents/R projects/OV_Histotypes
## Warning: 867 parsing failures.
## row col expected actual file
## 1362 review_grade a double n/a '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2973 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2974 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3541 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3542 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## .... .................... .......... ......... .....................................................................
## See problems(...) for more details.
cs1_class <- readRDS(here::here("data/cs1_class.rds"))
cs2_class <- readRDS(here::here("data/cs2_class.rds"))
First we validate the CS2 normalization process by looking at the distribution of CS3 non-normalized samples with:
# Dataset Split -----------------------------------------------------------
# CS2 gene order
cs2_genes <- names(cs2_clean)[-1:-2]
# Split CS2 common samples into Reference and Validation sets
# Reference set: duplicate samples
cs2_ref <- cs2_clean %>%
dplyr::filter(duplicated(ottaID) | duplicated(ottaID, fromLast = TRUE)) %>%
dplyr::select(-FileName)
# Validation set: unique samples
cs2_val <- dplyr::anti_join(cs2_clean, cs2_ref, by = "ottaID") %>%
dplyr::select(-FileName)
# No Normalization --------------------------------------------------------
# Sort CS2 validation set without normalization to preserve sample order
cs2_val_norm0 <- dplyr::arrange(cs2_val, ottaID)
# Extract CS3 samples corresponding to CS2 validation set without normalization
# Averaged gene expression within duplicates
cs3_val_norm0 <- cs3_clean %>%
dplyr::select(names(cs2_val_norm0)) %>%
dplyr::semi_join(cs2_val_norm0, by = "ottaID") %>%
dplyr::group_by(ottaID) %>%
dplyr::summarize_if(is.double, mean)
# Pool Method -------------------------------------------------------------
# Pool set: pool samples from CS2
pool_samples <-
gsub("-|\\.RCC", "", grep("POOL", pools[["CS2-FileName"]], value = TRUE))
cs2_pools <- dplyr::select(cs2_norm, Name, paste0("X", pool_samples))
# Locked down reference pools weights
weights <- c("Pool1", "Pool2", "Pool3") %>%
purrr::set_names() %>%
purrr::map_dbl(~ mean(grepl(., names(ref_pools), ignore.case = TRUE)))
# Weighted mean gene expression for CS2 pools (using reference pools weights)
cs2_pools_mgx <- weights %>%
purrr::imap_dfc(~ .x * rowMeans(dplyr::select(cs2_pools, dplyr::matches(.y)))) %>%
dplyr::transmute(Name = cs2_pools[["Name"]], cs2_exp = rowSums(.))
# Mean gene expression for CS3 reference pools
cs3_pools_mgx <-
tibble::enframe(rowMeans(ref_pools), name = "Name", value = "cs3_exp")
# Transposed validation set
cs2_val_t <- cs2_val %>%
tidyr::gather(Name, value, -1) %>%
tidyr::spread(ottaID, value)
# Normalize each gene by adding batch effect (diff in mean gx)
cs2_val_norm1 <-
dplyr::inner_join(cs3_pools_mgx, cs2_pools_mgx, by = "Name") %>%
dplyr::transmute(Name, be = cs3_exp - cs2_exp) %>%
dplyr::inner_join(cs2_val_t, by = "Name") %>%
tidyr::gather(ottaID, exp, -1:-2) %>%
dplyr::transmute(Name = factor(Name, levels = cs2_genes), ottaID, exp = be + exp) %>%
tidyr::spread(Name, exp)
# Reference Method --------------------------------------------------------
# Corresponding samples in CS3 found in CS2 reference set
cs3_ref <- cs3_clean %>%
dplyr::semi_join(cs2_ref, by = "ottaID") %>%
dplyr::select(cs2_genes)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cs2_genes)` instead of `cs2_genes` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Remove non-numeric column names
cs2_ref2 <- dplyr::select_if(cs2_ref, is.double)
cs2_val2 <- dplyr::select_if(cs2_val, is.double)
# Normalize by reference method using reference samples
cs2_val_norm2 <- nanostringr::refMethod(cs2_val2, cs2_ref2, cs3_ref) %>%
tibble::as_tibble() %>%
tibble::add_column(ottaID = cs2_val[["ottaID"]], .before = 1) %>%
dplyr::arrange(ottaID)
# Method Comparisons ------------------------------------------------------
# Compare cs2_val_norm0, cs2_val_norm1, cs2_val_norm2, with cs3_val_norm0
cs_combined <-
tibble::lst(cs2_val_norm0, cs2_val_norm1, cs2_val_norm2, cs3_val_norm0) %>%
dplyr::bind_rows(.id = "dataset") %>%
tidyr::gather(gene, exp, -1:-2, factor_key = TRUE) %>%
tidyr::spread(dataset, exp)
# Common plotting layers
n_samples <- dplyr::n_distinct(cs_combined[["ottaID"]])
layers <- list(
geom_point(alpha = 0.3),
geom_abline(slope = 1, intercept = 0, color = "blue"),
facet_wrap(~ gene, nrow = 8, ncol = 9),
labs(
y = "CS3 No Normalization",
title = paste0("CS2 Validation Set (n=", n_samples,
") vs. Corresponding CS3 Samples")
)
)
# Scatterplots of each CS2 dataset vs CS3
p0 <- ggplot(cs_combined, aes(cs2_val_norm0, cs3_val_norm0)) +
xlab("CS2 No Normalization") +
layers
print(p0)
Gene Expression CS2 No Normalization vs. CS3
p1 <- ggplot(cs_combined, aes(cs2_val_norm1, cs3_val_norm0)) +
xlab("CS2 Pools Normalization") +
layers
print(p1)
Gene Expression CS2 Pools Normalization vs. CS3
p2 <- ggplot(cs_combined, aes(cs2_val_norm2, cs3_val_norm0)) +
xlab("CS2 Reference Normalization") +
layers
print(p2)
Gene Expression CS2 Reference Normalization vs. CS3
# Concordance Histograms --------------------------------------------------
# Concordance measures between samples of same gene, for each dataset comparison
all_metrics <- cs_combined %>%
tidyr::gather(Methods, cs2_val_norm, dplyr::matches("cs2")) %>%
dplyr::mutate(
Methods = dplyr::recode_factor(
Methods,
cs2_val_norm0 = "CS2 None vs. CS3 None",
cs2_val_norm1 = "CS2 Pools vs. CS3 None",
cs2_val_norm2 = "CS2 Common vs. CS3 None"
)
) %>%
dplyr::group_by(Methods, ottaID) %>%
dplyr::summarize(
R2 = cor(cs2_val_norm, cs3_val_norm0) ^ 2,
Ca = epiR::epi.ccc(cs2_val_norm, cs3_val_norm0)[["C.b"]],
Rc = epiR::epi.ccc(cs2_val_norm, cs3_val_norm0)[["rho.c"]][["est"]]
) %>%
dplyr::ungroup() %>%
tidyr::gather(Metric, Expression, c("R2", "Ca", "Rc"), factor_key = TRUE)
## `summarise()` regrouping output by 'Methods' (override with `.groups` argument)
# Plot all combinations of cross-dataset concordance measure histograms
p <- ggplot(all_metrics, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
facet_grid(rows = vars(Methods), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "CS2 Datasets vs. CS3 Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p)
Concordance Histograms
)
The histotype distributions on the full data are shown below.
# Histotype Distributions -------------------------------------------------
all_hist <- hist %>% dplyr::filter(!is.na(revHist))
all_hist %>%
dplyr::count(CodeSet, hist_gr) %>%
tidyr::spread(CodeSet, n, fill = 0L) %>%
knitr::kable(caption = "All CodeSet Histotype Groups")
| hist_gr | CS1 | CS2 | CS3 |
|---|---|---|---|
| HGSC | 169 | 757 | 2453 |
| non-HGSC | 196 | 377 | 677 |
all_counts <- all_hist %>% dplyr::count(CodeSet, revHist)
all_counts %>%
tidyr::spread(CodeSet, n, fill = 0L) %>%
knitr::kable(caption = "All CodeSet Histotypes")
| revHist | CS1 | CS2 | CS3 |
|---|---|---|---|
| CARCINOMA-NOS | 0 | 61 | 23 |
| Carcinoma, NOS | 0 | 0 | 2 |
| CCOC | 57 | 68 | 182 |
| CCOC-MCT | 0 | 1 | 0 |
| Cell-Line | 17 | 48 | 13 |
| CTRL | 0 | 12 | 0 |
| ENOC | 61 | 30 | 272 |
| ENOC-CCOC | 0 | 7 | 0 |
| ERROR | 0 | 3 | 0 |
| HGSC | 169 | 757 | 2453 |
| HGSC-MCT | 0 | 1 | 0 |
| LGSC | 22 | 29 | 50 |
| MBOT | 0 | 20 | 3 |
| MET-NOP | 0 | 21 | 0 |
| MIXED (ENOC/CCOC) | 0 | 0 | 1 |
| MIXED (ENOC/LGSC) | 0 | 0 | 1 |
| MIXED (HGSC/CCOC) | 0 | 0 | 1 |
| mixed cell | 0 | 0 | 7 |
| MMMT | 0 | 0 | 30 |
| MUC | 20 | 61 | 77 |
| Other (use when 6, 7, or 9 is not distinguished) or unknown if epithelial | 0 | 0 | 1 |
| Other/Exclude | 0 | 0 | 8 |
| SBOT | 19 | 10 | 3 |
| Serous | 0 | 0 | 2 |
| serous LMP | 0 | 0 | 1 |
| SQAMOUS | 0 | 1 | 0 |
| UNK | 0 | 4 | 0 |
common_counts <- all_hist %>%
dplyr::filter(FileName %in% common_samples) %>%
dplyr::count(CodeSet, revHist)
common_counts %>%
tidyr::spread(CodeSet, n, fill = 0L) %>%
knitr::kable(caption = "Common Summary ID CodeSet Histotypes")
| revHist | CS1 | CS2 | CS3 |
|---|---|---|---|
| CCOC | 3 | 4 | 9 |
| Cell-Line | 4 | 5 | 5 |
| ENOC | 4 | 4 | 9 |
| HGSC | 68 | 64 | 98 |
| LGSC | 7 | 5 | 8 |
| MUC | 7 | 5 | 11 |
all_maj_counts <- all_hist %>%
dplyr::filter(revHist %in% c("HGSC", "LGSC", "ENOC", "CCOC", "MUC")) %>%
dplyr::count(CodeSet, revHist)
all_maj_counts %>%
tidyr::spread(CodeSet, n, fill = 0L) %>%
dplyr::mutate(CS1_percent = CS1 /sum(CS1) * 100, CS1_percent = round(CS1_percent,1)) %>%
dplyr::mutate(CS2_percent = CS2 /sum(CS2) * 100, CS2_percent = round(CS2_percent,1)) %>%
dplyr::mutate(CS3_percent = CS3 /sum(CS3) * 100, CS3_percent = round(CS3_percent,1)) %>%
knitr::kable(caption = "All CodeSet Major Histotypes")
| revHist | CS1 | CS2 | CS3 | CS1_percent | CS2_percent | CS3_percent |
|---|---|---|---|---|---|---|
| CCOC | 57 | 68 | 182 | 17.3 | 7.2 | 6.0 |
| ENOC | 61 | 30 | 272 | 18.5 | 3.2 | 9.0 |
| HGSC | 169 | 757 | 2453 | 51.4 | 80.1 | 80.9 |
| LGSC | 22 | 29 | 50 | 6.7 | 3.1 | 1.6 |
| MUC | 20 | 61 | 77 | 6.1 | 6.5 | 2.5 |
all_counts %>%
dplyr::filter(CodeSet == "CS1") %>%
knitr::kable(caption = "CS1 Histotypes")
| CodeSet | revHist | n |
|---|---|---|
| CS1 | CCOC | 57 |
| CS1 | Cell-Line | 17 |
| CS1 | ENOC | 61 |
| CS1 | HGSC | 169 |
| CS1 | LGSC | 22 |
| CS1 | MUC | 20 |
| CS1 | SBOT | 19 |
all_counts %>%
dplyr::filter(CodeSet == "CS2") %>%
knitr::kable(caption = "CS2 Histotypes")
| CodeSet | revHist | n |
|---|---|---|
| CS2 | CARCINOMA-NOS | 61 |
| CS2 | CCOC | 68 |
| CS2 | CCOC-MCT | 1 |
| CS2 | Cell-Line | 48 |
| CS2 | CTRL | 12 |
| CS2 | ENOC | 30 |
| CS2 | ENOC-CCOC | 7 |
| CS2 | ERROR | 3 |
| CS2 | HGSC | 757 |
| CS2 | HGSC-MCT | 1 |
| CS2 | LGSC | 29 |
| CS2 | MBOT | 20 |
| CS2 | MET-NOP | 21 |
| CS2 | MUC | 61 |
| CS2 | SBOT | 10 |
| CS2 | SQAMOUS | 1 |
| CS2 | UNK | 4 |
all_counts %>%
dplyr::filter(CodeSet == "CS3") %>%
knitr::kable(caption = "CS3 Histotypes")
| CodeSet | revHist | n |
|---|---|---|
| CS3 | CARCINOMA-NOS | 23 |
| CS3 | Carcinoma, NOS | 2 |
| CS3 | CCOC | 182 |
| CS3 | Cell-Line | 13 |
| CS3 | ENOC | 272 |
| CS3 | HGSC | 2453 |
| CS3 | LGSC | 50 |
| CS3 | MBOT | 3 |
| CS3 | MIXED (ENOC/CCOC) | 1 |
| CS3 | MIXED (ENOC/LGSC) | 1 |
| CS3 | MIXED (HGSC/CCOC) | 1 |
| CS3 | mixed cell | 7 |
| CS3 | MMMT | 30 |
| CS3 | MUC | 77 |
| CS3 | Other (use when 6, 7, or 9 is not distinguished) or unknown if epithelial | 1 |
| CS3 | Other/Exclude | 8 |
| CS3 | SBOT | 3 |
| CS3 | Serous | 2 |
| CS3 | serous LMP | 1 |
The training set distributions for CS1 and CS2 are shown below.
cs1_class %>%
tibble::enframe(value = "histotype") %>%
dplyr::count(histotype) %>%
knitr::kable(caption = "CS1 Training Set Histotypes")
| histotype | n |
|---|---|
| CCC | 57 |
| ENOCa | 59 |
| HGSC | 156 |
| LGSC | 16 |
| MUC | 16 |
cs2_class %>%
tibble::enframe(value = "histotype") %>%
dplyr::count(histotype) %>%
knitr::kable(caption = "CS2 Training Set Histotypes")
| histotype | n |
|---|---|
| CCOC | 68 |
| ENOC | 30 |
| HGSC | 757 |
| LGSC | 29 |
| MUC | 61 |
source(here::here("validation/cs_process_cohorts.R"))
## Warning: 867 parsing failures.
## row col expected actual file
## 1362 review_grade a double n/a '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2973 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2974 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3541 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3542 reviewCompletionDate date like 2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## .... .................... .......... ......... .....................................................................
## See problems(...) for more details.
# Pairwise CodeSet comparisons
codesets <- c("CS1", "CS2", "CS3")
all_codesets <- combn(codesets, 2) %>%
as_tibble() %>%
set_names(map_chr(., paste, collapse = "_vs_"))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
cs1_raw_avg <- cs1 %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
cs2_raw_avg <- cs2 %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
cs3_raw_avg <- cs3 %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
raw_list <-
set_names(list(cs1_raw_avg, cs2_raw_avg, cs3_raw_avg), codesets)
metrics_raw <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(raw_list[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_raw <- ggplot(metrics_raw, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Raw Non-Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_raw)
cs1_hknorm_avg <- cs1_norm %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
cs2_hknorm_avg <- cs2_norm %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
cs3_hknorm_avg <- cs3_norm %>%
rename_all(~ gsub("^X", "", .)) %>%
filter(Name %in% common_genes) %>%
select_if(names(.) %in% c("Name", common_samples)) %>%
mutate(Name = fct_inorder(Name)) %>%
gather(FileName, value, -Name) %>%
inner_join(hist, by = "FileName") %>%
spread(Name, value) %>%
select(FileName, ottaID, all_of(common_genes)) %>%
group_by(ottaID) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
column_to_rownames("ottaID")
hknorm_list <-
set_names(list(cs1_hknorm_avg, cs2_hknorm_avg, cs3_hknorm_avg), codesets)
metrics_hknorm <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(hknorm_list[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_hknorm <- ggplot(metrics_hknorm, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "HK genes Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_hknorm)
We employ a new normalization technique using randomly selected samples common to all three CodeSets with a uniform distribution of histotypes as the reference dataset. The number of randomly selected samples ranges from 1-3 per histotype. Hence, the reference dataset has either 5, 10, or 15 samples and we validate on the remaining. Note that ottaID duplicates are collapsed by mean averaging the gene expression. n=72 common samples.
CodeSets 1 and 2 are calibrated to CodeSet3 as follows:
X^1(norm) = X^1 - R^1 + R^3
X^2(norm) = X^2 - R^2 + R^3
X^3(norm) = X^3
# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand3 <- hist %>%
filter(ottaID %in% unique(c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
))) %>%
distinct(ottaID, revHist) %>%
group_by(revHist) %>%
slice_sample(n = 3) %>%
ungroup()
# Gene expression from random common samples, preserving gene order
cs1_rand3 <- join_avg(cs1_clean, hist_rand3, "ottaID", "keep")
cs2_rand3 <- join_avg(cs2_clean, hist_rand3, "ottaID", "keep")
cs3_rand3 <- join_avg(cs3_clean, hist_rand3, "ottaID", "keep")
# Remove common samples from CS1, preserving gene order
cs1_counts3 <- join_avg(cs1_clean, hist_rand3, "ottaID", "discard")
cs2_counts3 <- join_avg(cs2_clean, hist_rand3, "ottaID", "discard")
cs3_counts3 <- join_avg(cs3_clean, hist_rand3, "ottaID", "discard")
# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand3 <-
refMethod(cs1_counts3, cs3_rand3, cs1_rand3) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
cs2_norm_rand3 <-
refMethod(cs2_counts3, cs3_rand3, cs2_rand3) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
# Combined gene expression
counts3 <-
set_names(list(cs1_counts3, cs2_counts3, cs3_counts3), codesets)
norm_rand3 <- list(cs1_norm_rand3, cs2_norm_rand3, cs3_counts3) %>%
set_names(codesets) %>%
map_at(1:2, select, -"revHist")
# Concordance measures for all genes averaged across samples
metrics_non3 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(counts3[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
metrics_rand3 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(norm_rand3[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non3 <- ggplot(metrics_non3, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random3 Non-Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_non3)
Random3 Non-Normalized Concordance Measure Distributions
p_rand3 <- ggplot(metrics_rand3, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random3 Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_rand3)
Random3 Normalized Concordance Measure Distributions
# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand2 <- hist %>%
filter(ottaID %in% unique(c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
))) %>%
distinct(ottaID, revHist) %>%
group_by(revHist) %>%
slice_sample(n = 2) %>%
ungroup()
# Gene expression from random common samples, preserving gene order
cs1_rand2 <- join_avg(cs1_clean, hist_rand2, "ottaID", "keep")
cs2_rand2 <- join_avg(cs2_clean, hist_rand2, "ottaID", "keep")
cs3_rand2 <- join_avg(cs3_clean, hist_rand2, "ottaID", "keep")
# Remove common samples from CS1, preserving gene order
cs1_counts2 <- join_avg(cs1_clean, hist_rand2, "ottaID", "discard")
cs2_counts2 <- join_avg(cs2_clean, hist_rand2, "ottaID", "discard")
cs3_counts2 <- join_avg(cs3_clean, hist_rand2, "ottaID", "discard")
# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand2 <-
refMethod(cs1_counts2, cs3_rand2, cs1_rand2) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
cs2_norm_rand2 <-
refMethod(cs2_counts2, cs3_rand2, cs2_rand2) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
# Combined gene expression
counts2 <-
set_names(list(cs1_counts2, cs2_counts2, cs3_counts2), codesets)
norm_rand2 <- list(cs1_norm_rand2, cs2_norm_rand2, cs3_counts2) %>%
set_names(codesets) %>%
map_at(1:2, select, -"revHist")
# Concordance measures for all genes averaged across samples
metrics_non2 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(counts2[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
metrics_rand2 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(norm_rand2[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non2 <- ggplot(metrics_non2, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random2 Non-Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_non2)
Random2 Non-Normalized Concordance Measure Distributions
p_rand2 <- ggplot(metrics_rand2, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 50, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random2 Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_rand2)
Random2 Normalized Concordance Measure Distributions
# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand1 <- hist %>%
filter(ottaID %in% unique(c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
))) %>%
distinct(ottaID, revHist) %>%
group_by(revHist) %>%
slice_sample(n = 1) %>%
ungroup()
# Gene expression from random common samples, preserving gene order
cs1_rand1 <- join_avg(cs1_clean, hist_rand1, "ottaID", "keep")
cs2_rand1 <- join_avg(cs2_clean, hist_rand1, "ottaID", "keep")
cs3_rand1 <- join_avg(cs3_clean, hist_rand1, "ottaID", "keep")
# Remove common samples from CS1, preserving gene order
cs1_counts1 <- join_avg(cs1_clean, hist_rand1, "ottaID", "discard")
cs2_counts1 <- join_avg(cs2_clean, hist_rand1, "ottaID", "discard")
cs3_counts1 <- join_avg(cs3_clean, hist_rand1, "ottaID", "discard")
# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand1 <-
refMethod(cs1_counts1, cs3_rand1, cs1_rand1) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
cs2_norm_rand1 <-
refMethod(cs2_counts1, cs3_rand1, cs2_rand1) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
# Combined gene expression
counts1 <-
set_names(list(cs1_counts1, cs2_counts1, cs3_counts1), codesets)
norm_rand1 <- list(cs1_norm_rand1, cs2_norm_rand1, cs3_counts1) %>%
set_names(codesets) %>%
map_at(1:2, select, -"revHist")
# Concordance measures for all genes averaged across samples
metrics_non1 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(counts1[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
metrics_rand1 <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(norm_rand1[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non1 <- ggplot(metrics_non1, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random1 Non-Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_non1)
Random1 Non-Normalized Concordance Measure Distributions
p_rand1 <- ggplot(metrics_rand1, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random1 Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_rand1)
Random1 Normalized Concordance Measure Distributions
cs1_norm_rand1_byhist <- cs1_norm_rand1 %>%
split(.$revHist) %>%
map(select, -"revHist")
cs2_norm_rand1_byhist <- cs2_norm_rand1 %>%
split(.$revHist) %>%
map(select, -"revHist")
cs1_counts1_byhist <- cs1_counts1 %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
cs2_counts1_byhist <- cs2_counts1 %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
cs3_counts1_byhist <- cs3_counts1 %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
rand1_cs1_vs_cs3_non <- list(cs1_counts1_byhist, cs3_counts1_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Non"))
rand1_cs1_vs_cs3_norm <- list(cs1_norm_rand1_byhist, cs3_counts1_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Norm"))
rand1_cs1_vs_cs3 <-
inner_join(rand1_cs1_vs_cs3_non, rand1_cs1_vs_cs3_norm, by = "hist")
knitr::kable(rand1_cs1_vs_cs3,
caption = "Random1 CS1 vs. CS3 Median Concordance Measures by Histotypes",
digits = 2)
| hist | R2-Non | Ca-Non | Rc-Non | R2-Norm | Ca-Norm | Rc-Norm |
|---|---|---|---|---|---|---|
| CCOC | 1.00 | 0.30 | 0.08 | 1.00 | 0.28 | 0.09 |
| ENOC | 1.00 | 0.54 | 0.54 | 1.00 | 0.61 | 0.61 |
| HGSC | 0.78 | 0.98 | 0.85 | 0.78 | 0.97 | 0.86 |
| LGSC | 0.97 | 0.87 | 0.81 | 0.97 | 0.90 | 0.87 |
| MUC | 0.75 | 0.85 | 0.68 | 0.75 | 0.82 | 0.64 |
rand1_cs2_vs_cs3_non <- list(cs2_counts1_byhist, cs3_counts1_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Non"))
rand1_cs2_vs_cs3_norm <- list(cs2_norm_rand1_byhist, cs3_counts1_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Norm"))
rand1_cs2_vs_cs3 <-
inner_join(rand1_cs2_vs_cs3_non, rand1_cs2_vs_cs3_norm, by = "hist")
knitr::kable(rand1_cs2_vs_cs3,
caption = "Random1 CS2 vs. CS3 Median Concordance Measures by Histotypes",
digits = 2)
| hist | R2-Non | Ca-Non | Rc-Non | R2-Norm | Ca-Norm | Rc-Norm |
|---|---|---|---|---|---|---|
| CCOC | 1.00 | 0.20 | 0.04 | 1.00 | 0.27 | 0.09 |
| ENOC | 1.00 | 0.67 | 0.64 | 1.00 | 0.64 | 0.61 |
| HGSC | 0.83 | 0.96 | 0.86 | 0.83 | 0.99 | 0.89 |
| LGSC | 0.99 | 0.92 | 0.91 | 0.99 | 0.96 | 0.94 |
| MUC | 0.70 | 0.78 | 0.55 | 0.70 | 0.86 | 0.57 |
# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand <- hist %>%
filter(ottaID %in% unique(c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
))) %>%
distinct(ottaID, revHist) %>%
filter(revHist == "HGSC") %>%
slice_sample(n = 3)
# Gene expression from random common samples, preserving gene order
cs1_rand <- join_avg(cs1_clean, hist_rand, "ottaID", "keep")
cs2_rand <- join_avg(cs2_clean, hist_rand, "ottaID", "keep")
cs3_rand <- join_avg(cs3_clean, hist_rand, "ottaID", "keep")
# Remove common samples from CS1, preserving gene order
cs1_counts <- join_avg(cs1_clean, hist_rand, "ottaID", "discard")
cs2_counts <- join_avg(cs2_clean, hist_rand, "ottaID", "discard")
cs3_counts <- join_avg(cs3_clean, hist_rand, "ottaID", "discard")
# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand <-
refMethod(cs1_counts, cs3_rand, cs1_rand) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
cs2_norm_rand <-
refMethod(cs2_counts, cs3_rand, cs2_rand) %>%
as.data.frame() %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID")
norm_rand <- list(cs1_norm_rand, cs2_norm_rand, cs3_counts) %>%
set_names(codesets) %>%
map_at(1:2, select, -"revHist")
metrics_rand <- all_codesets %>%
imap_dfr(~ {
pmap_dfr(norm_rand[.x], ~ cor_stats(.x, .y)) %>%
mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup()
p_rand <- ggplot(metrics_rand, aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
vjust = 1,
check_overlap = TRUE) +
facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "Random3 HGSC Normalized Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_rand)
Random3 HGSC Normalized Concordance Measure Distributions
cs1_norm_byhist <- cs1_norm_rand %>%
split(.$revHist) %>%
map(select, -"revHist")
cs2_norm_byhist <- cs2_norm_rand %>%
split(.$revHist) %>%
map(select, -"revHist")
cs1_byhist <- cs1_counts %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
cs2_byhist <- cs2_counts %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
cs3_byhist <- cs3_counts %>%
rownames_to_column("ottaID") %>%
inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
column_to_rownames("ottaID") %>%
split(.$revHist) %>%
map(select, -"revHist")
rand3_hgsc_cs1_vs_cs3_non <- list(cs1_byhist, cs3_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
.id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Non"))
rand3_hgsc_cs1_vs_cs3_norm <- list(cs1_norm_byhist, cs3_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
.id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Norm"))
rand3_hgsc_cs1_vs_cs3 <-
inner_join(rand3_hgsc_cs1_vs_cs3_non, rand3_hgsc_cs1_vs_cs3_norm, by = "hist")
knitr::kable(rand3_hgsc_cs1_vs_cs3, caption = "Random3 HGSC CS1 vs. CS3 Median Concordance Measures by Histotypes", digits = 2)
| hist | R2-Non | Ca-Non | Rc-Non | R2-Norm | Ca-Norm | Rc-Norm |
|---|---|---|---|---|---|---|
| CCOC | 0.67 | 0.58 | 0.29 | 0.67 | 0.63 | 0.26 |
| ENOC | 0.89 | 0.80 | 0.71 | 0.89 | 0.80 | 0.74 |
| HGSC | 0.77 | 0.98 | 0.85 | 0.77 | 0.99 | 0.87 |
| LGSC | 0.94 | 0.85 | 0.79 | 0.94 | 0.88 | 0.83 |
| MUC | 0.76 | 0.93 | 0.73 | 0.76 | 0.93 | 0.79 |
rand3_hgsc_cs2_vs_cs3_non <- list(cs2_byhist, cs3_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
.id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Non"))
rand3_hgsc_cs2_vs_cs3_norm <- list(cs2_norm_byhist, cs3_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
.id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup() %>%
rename_if(is.double, ~ paste0(., "-Norm"))
rand3_hgsc_cs2_vs_cs3 <-
inner_join(rand3_hgsc_cs2_vs_cs3_non, rand3_hgsc_cs2_vs_cs3_norm, by = "hist")
knitr::kable(rand3_hgsc_cs2_vs_cs3, caption = "Random3 HGSC CS2 vs. CS3 Median Concordance Measures by Histotypes", digits = 2)
| hist | R2-Non | Ca-Non | Rc-Non | R2-Norm | Ca-Norm | Rc-Norm |
|---|---|---|---|---|---|---|
| CCOC | 0.69 | 0.54 | 0.35 | 0.69 | 0.64 | 0.39 |
| ENOC | 0.85 | 0.77 | 0.66 | 0.85 | 0.86 | 0.76 |
| HGSC | 0.82 | 0.96 | 0.86 | 0.82 | 0.99 | 0.90 |
| LGSC | 0.98 | 0.95 | 0.92 | 0.98 | 0.94 | 0.92 |
| MUC | 0.77 | 0.89 | 0.73 | 0.77 | 0.92 | 0.70 |
CodeSet2 contains 12 ref pool samples (Pool 1 = 4, Pool 2 = 4, Pool 3 = 4). CodeSet3 contains 22 ref pool samples (Pool 1 = 12, Pool 2 = 5, Pool 3 = 5). n=86 common samples.
CodeSet2 is calibrated to CodeSet3 as follows:
X^2(norm) = X^2 - R^2 + R^3
X^3(norm) = X^3
# Pool set: pool samples from CS2
pool_samples <- cohorts %>%
filter(file_source == "cs2", cohort == "POOL-CTRL") %>%
pull(col_name)
cs2_pools <- select(cs2_norm, Name, all_of(pool_samples))
# Locked down reference pools weights
weights <- c("Pool1", "Pool2", "Pool3") %>%
set_names() %>%
map_dbl(~ mean(grepl(., names(ref_pools), ignore.case = TRUE)))
# Weighted mean gene expression for CS2 pools (using reference pools weights)
cs2_pools_mgx <- weights %>%
imap_dfc(~ .x * rowMeans(select(cs2_pools, matches(.y)))) %>%
transmute(Name = cs2_pools[["Name"]], cs2_exp = rowSums(.))
# Mean gene expression for CS3 reference pools
cs3_pools_mgx <-
enframe(rowMeans(ref_pools), name = "Name", value = "cs3_exp")
# Extract cs2 norm counts (not pools)
cs2_norm_counts <- cs2_norm %>% select(-one_of(names(cs2_pools)[-1]))
# Normalize each gene by adding batch effect (diff in mean gx)
cs2_normalized_data_pools <-
inner_join(cs2_pools_mgx, cs3_pools_mgx, by = "Name") %>%
transmute(Name, be = cs3_exp - cs2_exp) %>%
inner_join(cs2_norm_counts, by = "Name") %>%
gather(FileName, exp, -1:-4) %>%
transmute(Name = fct_inorder(Name), FileName, exp = be + exp) %>%
spread(Name, exp)
## Summary
# CS2 pools: 9 samples, 365 genes # dim(t(cs2_pools))
# CS3 pools: 22 samples, 513 genes # dim(t(ref_pools))
# CS2 normalized: 891 samples (903 original - 12 from pools), 136 common genes # dim(cs2_normalized_data_pools)
# Keep only common samples between codesets, average out duplicates
tmp2 <- cs2_norm_counts %>%
gather(FileName, exp, -1:-3) %>%
mutate(FileName = gsub("^X", "", FileName)) %>%
inner_join(hist, by = "FileName") %>%
filter(Name %in% names(cs2_normalized_data_pools)[-1]) %>%
mutate(Name = factor(Name, levels = names(cs2_normalized_data_pools)[-1])) %>%
select(Name, ottaID, revHist, exp) %>%
group_by(Name, ottaID, revHist) %>%
summarize(exp = mean(exp)) %>%
ungroup() %>%
spread(Name, exp)
## `summarise()` regrouping output by 'Name', 'ottaID' (override with `.groups` argument)
tmp3 <- cs3_norm %>%
select(-one_of(names(ref_pools))) %>%
gather(FileName, exp, -1:-3) %>%
mutate(FileName = gsub("^X", "", FileName)) %>%
inner_join(hist, by = "FileName") %>%
filter(Name %in% names(cs2_normalized_data_pools)[-1]) %>%
mutate(Name = factor(Name, levels = names(cs2_normalized_data_pools)[-1])) %>%
select(Name, ottaID, revHist, exp) %>%
group_by(Name, ottaID, revHist) %>%
summarize(exp = mean(exp)) %>%
ungroup() %>%
spread(Name, exp)
## `summarise()` regrouping output by 'Name', 'ottaID' (override with `.groups` argument)
cs2_common_non <- semi_join(tmp2, tmp3, by = "ottaID")
cs3_common_non <- semi_join(tmp3, tmp2, by = "ottaID")
# Extract cs2 common pools-normalized
cs2_common_pools <- cs2_normalized_data_pools %>%
mutate(FileName = gsub("^X", "", FileName)) %>%
inner_join(hist, by = "FileName") %>%
group_by(ottaID, revHist) %>%
summarize_if(is.double, mean) %>%
ungroup() %>%
semi_join(cs3_common_non, by = "ottaID")
# Combined gene expression
sets <- c("CS2Non", "CS2Pools", "CS3Non")
all_comps <- combn(sets, 2) %>%
as_tibble() %>%
set_names(map_chr(., paste, collapse = "_vs_"))
common_gx <-
set_names(list(cs2_common_non, cs2_common_pools, cs3_common_non), sets) %>%
map(column_to_rownames, "ottaID") %>%
map(select, -"revHist")
# Concordance measures for all genes averaged across samples
metrics_pools <- all_comps %>%
imap_dfr(~ {
pmap_dfr(common_gx[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
}) %>%
gather(key = "Metric", value = "Expression", -Sites) %>%
group_by(Sites, Metric) %>%
mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
ungroup() %>%
mutate(Sites = factor(Sites, levels = names(all_comps)))
p_cs2non_vs_cs2pools <-
ggplot(metrics_pools %>% filter(Sites == "CS2Non_vs_CS2Pools"),
aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 120, label = Median),
hjust = 0,
check_overlap = TRUE) +
facet_grid(cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "CS2Non vs. CS2Pools Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_cs2non_vs_cs2pools)
CS2Non vs. CS2Pools Concordance Measure Distributions
p_cs2non_vs_cs3 <-
ggplot(metrics_pools %>% filter(Sites == "CS2Non_vs_CS3Non"),
aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
check_overlap = TRUE) +
facet_grid(cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "CS2 Non-Normalized Pools vs. CS3 Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_cs2non_vs_cs3)
CS2 Non-Normalized Pools vs. CS3 Concordance Measure Distributions
p_cs2pools_vs_cs3 <-
ggplot(metrics_pools %>% filter(Sites == "CS2Pools_vs_CS3Non"),
aes(Expression)) +
geom_histogram(bins = 30, fill = "blue") +
geom_text(aes(x = 0, y = 40, label = Median),
hjust = 0,
check_overlap = TRUE) +
facet_grid(cols = vars(Metric), scales = "free_x") +
labs(y = "Count",
title = "CS2 Normalized Pools vs. CS3 Concordance Measure Distributions") +
theme_bw() +
theme(panel.grid.minor = element_blank())
print(p_cs2pools_vs_cs3)
CS2 Normalized Pools vs. CS3 Concordance Measure Distributions
cs2_common_non_byhist <- cs2_common_non %>%
split(.$revHist) %>%
map(column_to_rownames, "ottaID") %>%
map(select, -"revHist")
cs2_common_pools_byhist <- cs2_common_pools %>%
split(.$revHist) %>%
map(column_to_rownames, "ottaID") %>%
map(select, -"revHist")
cs3_common_non_byhist <- cs3_common_non %>%
split(.$revHist) %>%
map(column_to_rownames, "ottaID") %>%
map(select, -"revHist")
pools_cs2non_vs_cs3 <- list(cs2_common_non_byhist,
cs3_common_non_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup()
knitr::kable(pools_cs2non_vs_cs3,
caption = "Pools Non-Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes",
digits = 2)
| hist | R2 | Ca | Rc |
|---|---|---|---|
| CCOC | 0.66 | 0.51 | 0.29 |
| ENOC | 0.87 | 0.74 | 0.64 |
| HGSC | 0.77 | 0.94 | 0.80 |
| LGSC | 0.98 | 0.94 | 0.92 |
| MUC | 0.75 | 0.86 | 0.69 |
pools_cs2norm_vs_cs3 <- list(cs2_common_pools_byhist,
cs3_common_non_byhist) %>%
transpose() %>%
map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
group_by(hist) %>%
summarize_if(is.double, median) %>%
ungroup()
knitr::kable(pools_cs2norm_vs_cs3,
caption = "Pools Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes",
digits = 2)
| hist | R2 | Ca | Rc |
|---|---|---|---|
| CCOC | 0.66 | 0.62 | 0.31 |
| ENOC | 0.87 | 0.74 | 0.67 |
| HGSC | 0.77 | 0.94 | 0.82 |
| LGSC | 0.98 | 0.96 | 0.93 |
| MUC | 0.75 | 0.91 | 0.70 |
common_dist_all <- hist %>%
filter(ottaID %in% c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
)) %>%
count(CodeSet, revHist) %>%
pivot_wider(names_from = "CodeSet", values_from = "n")
knitr::kable(common_dist_all, caption = "All Common Samples Histotype Distribution")
| revHist | CS1 | CS2 | CS3 |
|---|---|---|---|
| CCOC | 3 | 4 | 9 |
| ENOC | 4 | 4 | 9 |
| HGSC | 59 | 62 | 95 |
| LGSC | 7 | 5 | 8 |
| MUC | 7 | 5 | 11 |
common_dist_distinct <- hist %>%
filter(ottaID %in% c(
cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
)) %>%
distinct(ottaID, CodeSet, revHist) %>%
count(CodeSet, revHist) %>%
pivot_wider(names_from = "CodeSet", values_from = "n")
knitr::kable(common_dist_distinct, caption = "Distinct Common Samples Histotype Distribution")
| revHist | CS1 | CS2 | CS3 |
|---|---|---|---|
| CCOC | 3 | 3 | 3 |
| ENOC | 3 | 3 | 3 |
| HGSC | 57 | 57 | 57 |
| LGSC | 4 | 4 | 4 |
| MUC | 5 | 5 | 5 |